{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Create a Simple Estimator"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "from sklearn.datasets import load_breast_cancer\n",
    "\n",
    "# Load the breast-cancer dataset that ships with scikit-learn.\n",
    "bc = load_breast_cancer()\n",
    "\n",
    "# Join the words of each feature name with underscores so the column names\n",
    "# contain no whitespace.\n",
    "new_feature_names = []\n",
    "for raw_name in bc.feature_names:\n",
    "    new_feature_names.append('_'.join(raw_name.split()))\n",
    "\n",
    "# Features as a labelled DataFrame, target as the label array from the bunch.\n",
    "X = pd.DataFrame(data=bc.data, columns=new_feature_names)\n",
    "y = bc.target"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "# Hold out 33% of the rows; stratify on y and fix the seed for repeatable splits.\n",
    "X_train, X_test, y_train, y_test = train_test_split(\n",
    "    X,\n",
    "    y,\n",
    "    test_size=0.33,\n",
    "    random_state=7,\n",
    "    stratify=y,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.base import BaseEstimator, ClassifierMixin\n",
    "from sklearn.linear_model import Ridge\n",
    " \n",
    "class RidgeClassifier(BaseEstimator, ClassifierMixin):\n",
    "    \"\"\"A classifier made from Ridge regression.\n",
    "\n",
    "    The labels are regressed on as plain numbers by an internal ``Ridge``\n",
    "    model; each prediction is the seen label numerically closest to the\n",
    "    regressor's output.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    alpha : float, default 0\n",
    "        Regularization strength handed to ``Ridge``.\n",
    "\n",
    "    Attributes set by ``fit``\n",
    "    -------------------------\n",
    "    ridge_regressor : the fitted internal ``Ridge`` instance.\n",
    "    class_labels : sorted array of the distinct labels seen during ``fit``.\n",
    "    classes_ : the same array, under the attribute name scikit-learn\n",
    "        conventionally uses for classifiers.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, alpha=0):\n",
    "        self.alpha = alpha\n",
    "\n",
    "    def fit(self, X, y=None):\n",
    "        \"\"\"Fit the internal ridge regressor on ``X`` and ``y``; return ``self``.\n",
    "\n",
    "        ``y`` keeps its ``None`` default for signature compatibility, but the\n",
    "        labels are required; a ``ValueError`` is raised when they are missing.\n",
    "        \"\"\"\n",
    "        if y is None:\n",
    "            raise ValueError('RidgeClassifier.fit requires the target labels y')\n",
    "\n",
    "        #pass along the alpha parameter to the internal ridge estimator and perform a fit using it\n",
    "        self.ridge_regressor = Ridge(alpha=self.alpha)\n",
    "        self.ridge_regressor.fit(X, y)\n",
    "\n",
    "        #save the seen class labels (np.unique returns them sorted)\n",
    "        self.class_labels = np.unique(y)\n",
    "        self.classes_ = self.class_labels\n",
    "\n",
    "        return self\n",
    "\n",
    "    def predict(self, X_test):\n",
    "        \"\"\"Return, for each row of ``X_test``, the seen label nearest the ridge output.\"\"\"\n",
    "        #store the results of the internal ridge regressor estimator\n",
    "        results = np.asarray(self.ridge_regressor.predict(X_test))\n",
    "\n",
    "        #distance from every prediction (rows) to every label (columns);\n",
    "        #argmin keeps the first label on ties, like a per-sample search would\n",
    "        distances = np.abs(results[:, np.newaxis] - self.class_labels)\n",
    "        return self.class_labels[distances.argmin(axis=1)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.95744680851063835"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Fit the ridge-based classifier with alpha=1.5, then report held-out accuracy.\n",
    "r_classifier = RidgeClassifier(alpha=1.5).fit(X_train, y_train)\n",
    "r_classifier.score(X_test, y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\jdavila\\Anaconda27\\lib\\site-packages\\sklearn\\model_selection\\_search.py:747: DeprecationWarning: The grid_scores_ attribute was deprecated in version 0.18 in favor of the more elaborate cv_results_ attribute. The grid_scores_ attribute will not be available from 0.20\n",
      "  DeprecationWarning)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[mean: 0.94751, std: 0.00399, params: {'alpha': 0},\n",
       " mean: 0.95801, std: 0.01010, params: {'alpha': 0.5},\n",
       " mean: 0.96063, std: 0.01140, params: {'alpha': 1.0},\n",
       " mean: 0.96063, std: 0.01140, params: {'alpha': 1.5},\n",
       " mean: 0.96063, std: 0.01140, params: {'alpha': 2.0}]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.model_selection import GridSearchCV\n",
    "\n",
    "# Candidate regularization strengths, scored with 3-fold cross-validation.\n",
    "param_grid = {'alpha': [0, 0.5, 1.0, 1.5, 2.0]}\n",
    "gs_rc = GridSearchCV(RidgeClassifier(), param_grid, cv=3).fit(X_train, y_train)\n",
    "\n",
    "# grid_scores_ was deprecated in scikit-learn 0.18 and removed in 0.20;\n",
    "# cv_results_ carries the same per-candidate mean/std test scores.\n",
    "pd.DataFrame(gs_rc.cv_results_)[['params', 'mean_test_score', 'std_test_score']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.9521276595744681"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.linear_model import LogisticRegression\n",
    "\n",
    "# Baseline for comparison: logistic regression with default settings.\n",
    "lr = LogisticRegression().fit(X_train, y_train)\n",
    "lr.score(X_test, y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\jdavila\\Anaconda27\\lib\\site-packages\\statsmodels\\compat\\pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.\n",
      "  from pandas.core import datetools\n"
     ]
    }
   ],
   "source": [
    "import statsmodels.api as sm\n",
    "import statsmodels.formula.api as smf\n",
    "\n",
    "from sklearn.base import BaseEstimator, ClassifierMixin\n",
    "\n",
    "\n",
    "from sklearn.base import BaseEstimator, ClassifierMixin\n",
    "from sklearn.linear_model import Ridge\n",
    "\n",
    "class GEEClassifier(BaseEstimator, ClassifierMixin):\n",
    "    \n",
    "    \"\"\"A Classifier made from statsmodels' Generalized Estimating Equations\n",
    "    \n",
    "    documentation available at: http://www.statsmodels.org/dev/gee.html\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    group_by_feature : str\n",
    "        Name of the column of ``X`` from which the GEE groups are derived.\n",
    "        That column is left out of the regression formula.\n",
    "    \"\"\"\n",
    "    \n",
    "    def __init__(self,group_by_feature):\n",
    "        self.group_by_feature = group_by_feature\n",
    "          \n",
    "    def fit(self, X, y = None):\n",
    "        \"\"\"Fit a GEE model of ``y`` on every column of ``X`` except the grouping one.\n",
    "\n",
    "        ``X`` must expose ``.columns`` and ``.copy()`` (a DataFrame); its column\n",
    "        names are joined into the formula string. Returns ``self``.\n",
    "        \"\"\"\n",
    "        #Same settings as the documentation's example: \n",
    "        self.fam = sm.families.Poisson()\n",
    "        self.ind = sm.cov_struct.Exchangeable()\n",
    "        \n",
    "        #Auxiliary function: only used in this method within the class\n",
    "        def expand_X(X, y, desired_group): \n",
    "            #work on a copy so the caller's frame is not modified\n",
    "            X_plus = X.copy()\n",
    "            X_plus['y'] = y\n",
    "    \n",
    "            #NOTE(review): (v * 10)//10 floors each value, so one group is formed per\n",
    "            #integer part of the feature -- not a fixed number of groups. Confirm this\n",
    "            #is the intended grouping before relying on it.\n",
    "            X_plus[desired_group + '_group'] = (X_plus[desired_group] * 10)//10\n",
    "    \n",
    "            return X_plus\n",
    "        \n",
    "        #save the seen class labels\n",
    "        self.class_labels = np.unique(y)\n",
    "        \n",
    "        dataframe_feature_names = X.columns\n",
    "        not_group_by_features = [x for x in dataframe_feature_names if x != self.group_by_feature]\n",
    "        \n",
    "        formula_in = 'y ~ ' + ' + '.join(not_group_by_features)\n",
    "        \n",
    "        data = expand_X(X,y,self.group_by_feature)\n",
    "        self.mod = smf.gee(formula_in, \n",
    "                           self.group_by_feature + \"_group\", \n",
    "                           data, \n",
    "                           cov_struct=self.ind, \n",
    "                           family=self.fam)\n",
    "        \n",
    "        self.res = self.mod.fit()\n",
    "        \n",
    "        return self\n",
    "    \n",
    "    def predict(self, X_test):\n",
    "        \"\"\"Return, for each row of ``X_test``, the seen label nearest the GEE output.\"\"\"\n",
    "        #store the results of the internal GEE regressor estimator\n",
    "        results = np.asarray(self.res.predict(X_test))\n",
    "        \n",
    "        #distance from every prediction (rows) to every label (columns);\n",
    "        #argmin keeps the first label on ties, like a per-sample search would\n",
    "        distances = np.abs(results[:, np.newaxis] - self.class_labels)\n",
    "        return self.class_labels[distances.argmin(axis=1)]\n",
    "        \n",
    "    def print_fit_summary(self):\n",
    "        \"\"\"Print the summary of the fitted GEE results and return ``self``.\"\"\"\n",
    "        #the results live on the instance; the parenthesised form of print\n",
    "        #works on both Python 2 and Python 3\n",
    "        print(self.res.summary())\n",
    "        return self"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.94680851063829785"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Group on mean_concavity, fit the GEE classifier and report held-out accuracy.\n",
    "gee_classifier = GEEClassifier(group_by_feature='mean_concavity').fit(X_train, y_train)\n",
    "gee_classifier.score(X_test, y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "data_web_address = \"https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data\"\n",
    "\n",
    "# Column names are supplied to read_csv explicitly: eight predictors, then the label.\n",
    "feature_names = ['pregnancy_x', 'plasma_con', 'blood_pressure', 'skin_mm',\n",
    "                 'insulin', 'bmi', 'pedigree_func', 'age']\n",
    "column_names = feature_names + ['target']\n",
    "\n",
    "all_data = pd.read_csv(data_web_address, names=column_names)\n",
    "\n",
    "# Split the frame into the predictor columns and the label column.\n",
    "X = all_data[feature_names]\n",
    "y = all_data['target']\n",
    "\n",
    "\n",
    "from sklearn.model_selection import train_test_split\n",
    "# Hold out 20% of the rows; stratify on y and fix the seed for repeatable splits.\n",
    "X_train, X_test, y_train, y_test = train_test_split(\n",
    "    X,\n",
    "    y,\n",
    "    test_size=0.2,\n",
    "    random_state=7,\n",
    "    stratify=y,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.80519480519480524"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Group on blood_pressure, fit the GEE classifier and report held-out accuracy.\n",
    "gee_classifier = GEEClassifier(group_by_feature='blood_pressure').fit(X_train, y_train)\n",
    "gee_classifier.score(X_test, y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.76623376623376627"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Ridge-based classifier with its default alpha, scored on the held-out rows.\n",
    "r_classifier = RidgeClassifier().fit(X_train, y_train)\n",
    "r_classifier.score(X_test, y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pickle\n",
    "\n",
    "# The context manager closes the file even if pickling raises.\n",
    "with open('rc_inst.save', 'wb') as f:\n",
    "    pickle.dump(r_classifier, f, protocol=pickle.HIGHEST_PROTOCOL)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pickle\n",
    "\n",
    "# Only unpickle files you created yourself: pickle.load can execute arbitrary code.\n",
    "# The context manager closes the file even if loading raises.\n",
    "with open('rc_inst.save', 'rb') as f:\n",
    "    r_classifier = pickle.load(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.76623376623376627"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Score the estimator exactly as it was loaded from disk: re-fitting it here\n",
    "# would hide whether the persisted model survived the pickle round trip.\n",
    "r_classifier.score(X_test, y_test)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
